Clean up the Data
library(readr)
library(dplyr)
library(stringr)
df <- read.csv('raw_data/coffee_compare.csv')
coffee <- df %>% select(DBA, reinspections, checks, violations,
score, inspections, BORO, SCORE)
coffee$DBA = ifelse(str_detect(coffee$DBA, "DUNKIN"), 'DD', 'Starbucks')
coffee %>%
select(BORO, DBA) %>%
group_by(BORO) %>%
table()
## DBA
## BORO DD Starbucks
## BRONX 92 11
## BROOKLYN 139 44
## MANHATTAN 170 242
## QUEENS 188 42
## STATEN ISLAND 36 7
Visualization
library(ggthemes)
library(ggplot2)
library(plotly)
coffee_new <- coffee %>%
group_by(DBA, BORO) %>%
summarize(Value = n())
pc <- ggplot(coffee_new, aes(fill=DBA, y=Value, x=BORO)) +
geom_bar(position="dodge", stat="identity", width = 0.5) +
xlab('Neighborhood') +
ylab('Health Violations') +
labs(caption = 'Data Source: DOHMH',
fill = 'Brands') +
ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
theme_bw() +
scale_fill_manual(values = c("#f09a56", "#87dc97")) +
theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
legend.text = element_text(size=8),
legend.title = element_text(size=8))
pc

pc1 <- ggplot(coffee, aes(x = BORO, y = violations, color = DBA)) +
geom_point(alpha = 0.5) +
xlab('Neighborhood') +
ylab('Health Violations') +
labs(caption = 'DOHMH') +
ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
theme_bw() +
theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
legend.text = element_text(size=8),
legend.title = element_text(size=8)) +
scale_color_manual(values = c("DD" = "#f09a56",
'Starbucks' = "#87dc97"))
ggplotly(pc1)
coffee_new <- coffee %>%
group_by(DBA, BORO) %>%
summarize(total_vio = sum(violations, na.rm = T))
pc11 <- ggplot(coffee_new,
aes(fill=DBA, y=total_vio, x=BORO)) +
geom_bar(position="dodge", stat="identity", width = 0.5) +
xlab('Neighborhood') +
ylab('Health Violations') +
labs(caption = 'Data Source: DOHMH',
fill = 'Brands') +
ggtitle('Health Violations of Coffee Brands by Neighborhoods') +
theme_bw() +
scale_fill_manual(values = c("#f09a56", "#87dc97")) +
theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
legend.text = element_text(size=8),
legend.title = element_text(size=8))
pc11

pc2 <- ggplot(coffee, aes(x = BORO, y = SCORE, color = DBA)) +
geom_point(alpha = 0.5) +
xlab('Neighborhood') +
ylab('Total Score for a Particular Inspection') +
labs(caption = 'DOHMH') +
ggtitle('Score of Coffee Brands by Neighborhoods') +
theme_bw() +
theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
legend.text = element_text(size=8),
legend.title = element_text(size=8)) +
scale_color_manual(values = c("DD" = "#f09a56",
'Starbucks' = "#87dc97"))
ggplotly(pc2)
coffee_new <- coffee %>%
group_by(DBA, BORO) %>%
summarize(total_score = sum(SCORE, na.rm = T))
pc11 <- ggplot(coffee_new,
aes(fill=DBA, y=total_score, x=BORO)) +
geom_bar(position="dodge", stat="identity", width = 0.5) +
xlab('Neighborhood') +
ylab('Total Score for a Particular Inspection') +
labs(caption = 'Data Source: DOHMH',
fill = 'Brands') +
ggtitle('Inspection Score of Coffee Brands by Neighborhoods') +
theme_bw() +
scale_fill_manual(values = c("#f09a56", "#87dc97")) +
theme(plot.title = element_text(size=12, face="bold", hjust = 0.5),
legend.text = element_text(size=8),
legend.title = element_text(size=8))
pc11

Linear Regression Analysis
coffee_lm <- coffee %>%
group_by(DBA, BORO) %>%
summarize(total_num = n(),
total_vio = sum(violations, na.rm = T),
total_score = sum(SCORE, na.rm = T))
coffee_lm$DBA = ifelse(coffee_lm$DBA == "DD", 1, 0)
coffee_sb <- coffee_lm %>% filter(DBA == 1)
coffee_dd <- coffee_lm %>% filter(DBA == 0)
lm <- lm(total_num ~ total_vio + total_score,
data = coffee_sb)
summary(lm)
##
## Call:
## lm(formula = total_num ~ total_vio + total_score, data = coffee_sb)
##
## Residuals:
## 1 2 3 4 5
## -3.7532 1.8681 7.7430 -5.9649 0.1071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.04674 8.97407 -0.674 0.570
## total_vio 0.19675 0.09361 2.102 0.170
## total_score -0.10594 0.11825 -0.896 0.465
##
## Residual standard error: 7.521 on 2 degrees of freedom
## Multiple R-squared: 0.9926, Adjusted R-squared: 0.9851
## F-statistic: 133.4 on 2 and 2 DF, p-value: 0.007442
lm <- lm(total_num ~ total_vio + total_score,
data = coffee_dd)
summary(lm)
##
## Call:
## lm(formula = total_num ~ total_vio + total_score, data = coffee_dd)
##
## Residuals:
## 1 2 3 4 5
## 2.9596 0.4643 0.2344 -1.9530 -1.7052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.13450 1.62027 0.700 0.556
## total_vio 0.14931 0.09474 1.576 0.256
## total_score -0.01807 0.10081 -0.179 0.874
##
## Residual standard error: 2.806 on 2 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9992
## F-statistic: 2443 on 2 and 2 DF, p-value: 0.0004092
lm1 <- lm(total_vio ~ total_num,
data = coffee_sb)
summary(lm1)
##
## Call:
## lm(formula = total_vio ~ total_num, data = coffee_sb)
##
## Residuals:
## 1 2 3 4 5
## 69.29 -51.87 -44.06 48.47 -21.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.8885 70.8358 1.114 0.346613
## total_num 8.7481 0.5185 16.872 0.000453 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.92 on 3 degrees of freedom
## Multiple R-squared: 0.9896, Adjusted R-squared: 0.9861
## F-statistic: 284.7 on 1 and 3 DF, p-value: 0.0004534
lm1 <- lm(total_vio ~ total_num,
data = coffee_dd)
summary(lm1)
##
## Call:
## lm(formula = total_vio ~ total_num, data = coffee_dd)
##
## Residuals:
## 1 2 3 4 5
## -21.165 -6.443 -1.107 16.665 12.050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.92727 9.93968 -0.798 0.483
## total_num 7.55386 0.08895 84.922 3.6e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.45 on 3 degrees of freedom
## Multiple R-squared: 0.9996, Adjusted R-squared: 0.9994
## F-statistic: 7212 on 1 and 3 DF, p-value: 3.599e-06
lm2 <- lm(total_vio ~ total_num*DBA,
data = coffee_lm)
summary(lm2)
##
## Call:
## lm(formula = total_vio ~ total_num * DBA, data = coffee_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.873 -21.656 -3.775 15.511 69.287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.9273 26.6871 -0.297 0.7764
## total_num 7.5539 0.2388 31.630 6.64e-08 ***
## DBA 86.8158 58.3784 1.487 0.1875
## total_num:DBA 1.1942 0.4489 2.661 0.0375 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.85 on 6 degrees of freedom
## Multiple R-squared: 0.997, Adjusted R-squared: 0.9956
## F-statistic: 674.2 on 3 and 6 DF, p-value: 5.653e-08
lm2 <- lm(total_score ~ total_num*DBA,
data = coffee_lm)
summary(lm2)
##
## Call:
## lm(formula = total_score ~ total_num * DBA, data = coffee_lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79.691 -23.202 -4.934 23.618 95.573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2836 32.3629 -0.132 0.899
## total_num 7.0966 0.2896 24.503 3.04e-07 ***
## DBA 84.9386 70.7942 1.200 0.275
## total_num:DBA -0.2186 0.5443 -0.402 0.702
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 56.82 on 6 degrees of freedom
## Multiple R-squared: 0.9939, Adjusted R-squared: 0.9909
## F-statistic: 327.5 on 3 and 6 DF, p-value: 4.881e-07
Supervised Machine Learning
Question: Whether can be told the coffee store is Starbucks or not?
Binary outcome in this case.
library(caret)
coffee$DBA = ifelse(coffee$DBA == "DD", 0, 1)
coffee$DBA <- factor(coffee$DBA,
labels = c("Starbucks", "DD"),
levels = 1:0)
set.seed(12345)
in_train <- createDataPartition(y = coffee$DBA,
p = 0.8, list = FALSE)
training <- coffee[ in_train, ]
testing <- coffee[-in_train, ]
logit <- glm(DBA ~ checks + violations + score + BORO,
data = training, family = binomial(link = "logit"))
y_hat_logit <- predict(logit, newdata = testing, type = "response")
z_logit <- factor(y_hat_logit > 0.5,
levels = c(TRUE, FALSE),
labels = c("Starbucks", "DD"))
confusionMatrix(z_logit, reference = testing$DBA)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Starbucks DD
## Starbucks 34 101
## DD 35 24
##
## Accuracy : 0.299
## 95% CI : (0.2355, 0.3687)
## No Information Rate : 0.6443
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.2596
##
## Mcnemar's Test P-Value : 2.494e-08
##
## Sensitivity : 0.4928
## Specificity : 0.1920
## Pos Pred Value : 0.2519
## Neg Pred Value : 0.4068
## Prevalence : 0.3557
## Detection Rate : 0.1753
## Detection Prevalence : 0.6959
## Balanced Accuracy : 0.3424
##
## 'Positive' Class : Starbucks
##
LDA <- train(DBA ~ checks + violations + score + BORO,
data = training, method = "lda",
reProcess = c("center", "scale"))
z <- predict(LDA, newdata = testing)
confusionMatrix(z, testing$DBA)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Starbucks DD
## Starbucks 38 26
## DD 31 99
##
## Accuracy : 0.7062
## 95% CI : (0.6367, 0.7693)
## No Information Rate : 0.6443
## P-Value [Acc > NIR] : 0.04083
##
## Kappa : 0.3484
##
## Mcnemar's Test P-Value : 0.59624
##
## Sensitivity : 0.5507
## Specificity : 0.7920
## Pos Pred Value : 0.5938
## Neg Pred Value : 0.7615
## Prevalence : 0.3557
## Detection Rate : 0.1959
## Detection Prevalence : 0.3299
## Balanced Accuracy : 0.6714
##
## 'Positive' Class : Starbucks
##